Label = ob¶
Dados¶
[ ]:
# Load the labeled cancer datasets (São Paulo and outside-SP) from Drive.
# NOTE(review): `read_csv` is used unqualified — presumably `from pandas
# import read_csv` in an earlier cell; confirm the import cell.
df_SP = read_csv('/content/drive/MyDrive/Trabalho/Cancer/Datasets/cancer_SP_labels.csv')
df_fora = read_csv('/content/drive/MyDrive/Trabalho/Cancer/Datasets/cancer_foraSP_labels.csv')
(806402, 94)
(62317, 94)
SP
[ ]:
df_SP.isna().sum().sort_values(ascending=False).head(6)
DTRECIDIVA 717377
delta_t6 717377
delta_t5 717377
delta_t4 717377
IDADE 1
TRATAMENTO 0
dtype: int64
[ ]:
# Absolute correlation of every numeric column with the target 'ob',
# top 20 — useful for spotting leakage-prone, outcome-derived columns
# (e.g. vivo_sem_rec, ano_ob, ULTINFO).
corr_matrix = df_SP.corr()
abs(corr_matrix['ob']).sort_values(ascending = False).head(20)
ob 1.000000
vivo_sem_rec 0.946549
ob_sem_rec 0.870247
ULTINFO 0.868236
ano_ob 0.750408
PERDASEG 0.378763
ob_com_rec 0.319736
ANODIAG 0.269601
CIRURGIA 0.249248
QUIMIO 0.244519
delta_t7 0.216269
delta_t8 0.216105
delta_t9 0.212633
CATEATEND 0.212081
RECNENHUM 0.196724
delta_t5 0.169360
delta_t4 0.167767
delta_t6 0.160751
RECREGIO 0.141924
IDADE 0.136383
Name: ob, dtype: float64
[ ]:
df_SP.ob.value_counts()
0 480724
1 325678
Name: ob, dtype: int64
[ ]:
df_SP['ob'][df_SP.PERDASEG == 1].value_counts()
0 140925
1 1
Name: ob, dtype: int64
Fora de SP
[ ]:
df_fora.isna().sum().sort_values(ascending=False).head(6)
DTRECIDIVA 57799
delta_t6 57799
delta_t5 57799
delta_t4 57799
NAOTRAT 0
LOCALTNM 0
dtype: int64
[ ]:
# Same leakage screen for the outside-SP dataset: top-20 absolute
# correlations with the target 'ob'.
corr_matrix = df_fora.corr()
abs(corr_matrix['ob']).sort_values(ascending = False).head(20)
ob 1.000000
vivo_sem_rec 0.949190
ob_sem_rec 0.928200
ULTINFO 0.854305
ano_ob 0.772548
PERDASEG 0.359090
ob_com_rec 0.275720
CIRURGIA 0.257715
QUIMIO 0.253681
ANODIAG 0.252079
CATEATEND 0.209850
delta_t5 0.168003
delta_t8 0.167875
delta_t4 0.166380
delta_t7 0.165922
delta_t9 0.164855
delta_t6 0.163108
RECNENHUM 0.149262
RADIO 0.139862
GLEASON 0.104262
Name: ob, dtype: float64
[ ]:
df_fora.ob.value_counts()
0 44591
1 17726
Name: ob, dtype: int64
[ ]:
df_fora['ob'][df_fora.PERDASEG == 1].value_counts()
0 15263
Name: ob, dtype: int64
Divisão em treino e teste¶
[ ]:
# Columns dropped before training the 'ob' models: identifiers/dates,
# outcome-derived leakage columns (ano_ob, ob_*_rec, vivo_*_rec, ULTINFO,
# PERDASEG) and the delta_t* columns with massive missingness.
list_drop = ['UFRESID', 'DTCONSULT', 'DTDIAG', 'DTTRAT', 'DTRECIDIVA', 'DTULTINFO',
'IDADE', 'PERDASEG', 'CONSDIAG', 'TRATCONS', 'DIAGTRAT', 'delta_t4',
'delta_t5', 'delta_t6', 'delta_t7', 'delta_t8', 'delta_t9', 'ano_ob',
'ob_com_rec', 'ob_sem_rec', 'vivo_com_rec', 'vivo_sem_rec', 'ULTINFO']
lb = 'ob'
SP
[ ]:
X_trainSP, X_testSP, y_trainSP, y_testSP = get_train_test(df_SP, list_drop, lb)
X_train = (604801, 70), X_test = (201601, 70)
y_train = (604801,), y_test = (201601,)
Fora de SP
[ ]:
X_trainFora, X_testFora, y_trainFora, y_testFora = get_train_test(df_fora, list_drop, lb)
X_train = (46737, 70), X_test = (15580, 70)
y_train = (46737,), y_test = (15580,)
Encoder e normalização¶
SP
[ ]:
X_trainSP_enc, enc_SP, norm_SP = train_preprocessing(X_trainSP, normalizer='StandardScaler')
Fora de SP
[ ]:
X_trainFora_enc, enc_fora, norm_fora = train_preprocessing(X_trainFora, normalizer='StandardScaler')
PCA¶
SP
[ ]:
# Fit a full PCA (all components kept) on the encoded SP training data.
pca = PCA()
principalComponents = pca.fit_transform(X_trainSP_enc)
[ ]:
# Scree plot: per-component explained variance (bars) and its cumulative
# sum (step line). The x axis is derived from the fitted PCA instead of a
# hard-coded 70, so the plot stays correct if the encoded feature count
# changes.
n_components = len(pca.explained_variance_ratio_)
component_axis = np.arange(1, n_components + 1)
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=component_axis,
        y=np.cumsum(pca.explained_variance_ratio_),
        line_shape='hv',
    ))
fig.add_trace(
    go.Bar(
        x=component_axis,
        y=pca.explained_variance_ratio_
    ))
fig.update_layout(yaxis_title='Variância que pode ser "explicada"', xaxis_title='Número de componentes principais')
fig.show()
Fora de SP
[ ]:
# Fit a full PCA on the encoded outside-SP training data.
pca = PCA()
principalComponents = pca.fit_transform(X_trainFora_enc)
[ ]:
# Scree plot for the outside-SP PCA. The x axis is derived from the
# fitted PCA instead of a hard-coded 70, so the plot stays correct if the
# encoded feature count changes.
n_components = len(pca.explained_variance_ratio_)
component_axis = np.arange(1, n_components + 1)
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=component_axis,
        y=np.cumsum(pca.explained_variance_ratio_),
        line_shape='hv',
    ))
fig.add_trace(
    go.Bar(
        x=component_axis,
        y=pca.explained_variance_ratio_
    ))
fig.update_layout(yaxis_title='Variância que pode ser "explicada"', xaxis_title='Número de componentes principais')
fig.show()
Balanceamento dos dados¶
SP
[ ]:
# Number of samples for each label in the training data
y_trainSP.value_counts()
0 360572
1 244229
Name: ob, dtype: int64
[ ]:
# Undersample the majority class so both labels are equally represented.
# FIX: `fit_sample` was deprecated in imbalanced-learn 0.4 and removed in
# 0.8 — use `fit_resample`, consistent with every SMOTE call in this
# notebook.
rus = RandomUnderSampler(random_state=seed)
X_SP, y_SP = rus.fit_resample(X_trainSP_enc, y_trainSP)
[ ]:
y_SP.shape
(488458,)
Fora de SP
[ ]:
# Number of samples for each label in the training data
y_trainFora.value_counts()
0 33365
1 13372
Name: ob, dtype: int64
[ ]:
X_fora, y_fora = SMOTE(random_state=seed).fit_resample(X_trainFora_enc, y_trainFora)
[ ]:
y_fora.shape
(66730,)
Treinamento dos modelos de Machine Learning¶
Random Forest¶
[ ]:
# SP
# Random forest with library defaults on the balanced (undersampled) data.
rf_sp = RandomForestClassifier(random_state=seed)
rf_sp.fit(X_SP, y_SP)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=None, max_features='auto',
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=100,
n_jobs=None, oob_score=False, random_state=10, verbose=0,
warm_start=False)
[ ]:
# Outside SP
# NOTE(review): class_weight={0: 10, 1: 0.0001} is an extreme bias toward
# class 0 on top of SMOTE-balanced data — presumably manual
# precision/recall tuning; confirm this was intentional.
rf_fora = RandomForestClassifier(random_state=seed, class_weight={0: 10, 1: 0.0001})
rf_fora.fit(X_fora, y_fora)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
class_weight={0: 10, 1: 0.0001}, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
max_samples=None, min_impurity_decrease=0.0,
min_impurity_split=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=100, n_jobs=None, oob_score=False,
random_state=10, verbose=0, warm_start=False)
XGBoost¶
[ ]:
# SP
# XGBoost with deeper trees than the default (max_depth=15).
xgboost_sp = xgb.XGBClassifier(max_depth=15, random_state=seed)
xgboost_sp.fit(X_SP, y_SP)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0,
learning_rate=0.1, max_delta_step=0, max_depth=15,
min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
nthread=None, objective='binary:logistic', random_state=10,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
silent=None, subsample=1, verbosity=1)
[ ]:
# Outside SP
# NOTE(review): scale_pos_weight=10 biases toward the positive class even
# though the data was already SMOTE-balanced — confirm intentional.
xgboost_fora = xgb.XGBClassifier(max_depth=15, scale_pos_weight=10,
random_state=seed)
xgboost_fora.fit(X_fora, y_fora)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0,
learning_rate=0.1, max_delta_step=0, max_depth=15,
min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
nthread=None, objective='binary:logistic', random_state=10,
reg_alpha=0, reg_lambda=1, scale_pos_weight=10, seed=None,
silent=None, subsample=1, verbosity=1)
Salvando modelos¶
[ ]:
# Persist the SP artifacts (balanced training data, fitted encoders,
# scaler and both models). Kept commented out to avoid overwriting the
# saved file on notebook re-runs; comment indentation fixed so it can be
# re-enabled by deleting the leading '#'.
#with open('/content/drive/MyDrive/Trabalho/Cancer/Modelos/models_SP.pkl', 'wb') as arq:
#    pickle.dump(
#        {'X_train': X_SP, 'y_train': y_SP, 'Encoders': enc_SP,
#         'Normalizer': norm_SP, 'Random Forest': rf_sp, 'XGB': xgboost_sp}, arq)
[ ]:
# Same persistence for the outside-SP artifacts; commented out
# deliberately, indentation fixed so it can be re-enabled directly.
#with open('/content/drive/MyDrive/Trabalho/Cancer/Modelos/models_foraSP.pkl', 'wb') as arq:
#    pickle.dump(
#        {'X_train': X_fora, 'y_train': y_fora, 'Encoders': enc_fora,
#         'Normalizer': norm_fora, 'Random Forest': rf_fora, 'XGB': xgboost_fora}, arq)
Validação dos modelos¶
Pré-processamento dos dados de teste¶
[ ]:
# SP
# Apply the encoders/scaler fitted on the training split to the test split.
X_testSP_ = test_preprocessing(X_testSP, enc_SP, norm_SP)
[ ]:
# Outside SP
# Same test-split preprocessing with the fora-fitted transformers.
X_testFora_ = test_preprocessing(X_testFora, enc_fora, norm_fora)
Random Forest¶
[ ]:
# Row-normalized confusion matrix for the SP random forest on the test set.
# NOTE(review): sklearn's plot_confusion_matrix was deprecated in 1.0 and
# removed in 1.2 — migrate to ConfusionMatrixDisplay.from_estimator.
plot_confusion_matrix(rf_sp, X_testSP_, y_testSP, cmap='Blues', normalize="true", values_format='.2f')
plt.show()
[ ]:
plot_feat_importances(rf_sp, X_testSP)
[ ]:
# Row-normalized confusion matrix for the outside-SP random forest.
plot_confusion_matrix(rf_fora, X_testFora_, y_testFora, cmap='Blues', normalize="true", values_format='.2f')
plt.show()
[ ]:
plot_feat_importances(rf_fora, X_testFora)
XGBoost¶
[ ]:
# Row-normalized confusion matrix for the SP XGBoost model.
plot_confusion_matrix(xgboost_sp, X_testSP_, y_testSP, cmap='Blues', normalize='true', values_format='.2f')
plt.show()
[ ]:
plot_feat_importances(xgboost_sp, X_testSP)
[ ]:
# Row-normalized confusion matrix for the outside-SP XGBoost model.
plot_confusion_matrix(xgboost_fora, X_testFora_, y_testFora, cmap='Blues', normalize="true", values_format='.2f')
plt.show()
[ ]:
plot_feat_importances(xgboost_fora, X_testFora)
Label = RECNENHUM¶
Dados¶
[ ]:
# Reload the raw datasets for the RECNENHUM (no-recurrence) experiments,
# discarding the mutations made in the previous section.
df_SP = read_csv('/content/drive/MyDrive/Trabalho/Cancer/Datasets/cancer_SP_labels.csv')
df_fora = read_csv('/content/drive/MyDrive/Trabalho/Cancer/Datasets/cancer_foraSP_labels.csv')
(806402, 94)
(62317, 94)
SP
[ ]:
df_SP.isna().sum().sort_values(ascending=False).head(6)
DTRECIDIVA 717377
delta_t6 717377
delta_t5 717377
delta_t4 717377
IDADE 1
TRATAMENTO 0
dtype: int64
[ ]:
# Top-20 absolute correlations with RECNENHUM — the recurrence columns
# (RECLOCAL/RECREGIO/RECDIST, ob_com_rec, vivo_com_rec) are direct
# leakage and are dropped below.
corr_matrix = df_SP.corr()
abs(corr_matrix['RECNENHUM']).sort_values(ascending = False).head(20)
RECNENHUM 1.000000
ob_com_rec 0.829363
RECLOCAL 0.679652
RECREGIO 0.529338
vivo_com_rec 0.522025
RECDIST 0.497837
vivo_sem_rec 0.364914
ano_ob 0.242833
ob_sem_rec 0.227291
ob 0.196724
QUIMIO 0.147652
RADIO 0.085910
ULTINFO 0.081964
PERDASEG 0.070304
delta_t8 0.064934
delta_t7 0.063559
delta_t9 0.063466
IBGEATEN 0.062526
delta_t5 0.058940
delta_t4 0.055412
Name: RECNENHUM, dtype: float64
Fora de SP
[ ]:
df_fora.isna().sum().sort_values(ascending=False).head(6)
DTRECIDIVA 57799
delta_t6 57799
delta_t5 57799
delta_t4 57799
NAOTRAT 0
LOCALTNM 0
dtype: int64
[ ]:
# Same leakage screen for outside SP: top-20 absolute correlations with
# RECNENHUM.
corr_matrix = df_fora.corr()
abs(corr_matrix['RECNENHUM']).sort_values(ascending = False).head(20)
RECNENHUM 1.000000
ob_com_rec 0.749663
RECLOCAL 0.695533
vivo_com_rec 0.642191
RECDIST 0.579667
RECREGIO 0.430531
vivo_sem_rec 0.349104
ano_ob 0.217344
ob 0.149262
ob_sem_rec 0.135709
QUIMIO 0.130924
delta_t8 0.104972
delta_t9 0.103801
delta_t7 0.103667
delta_t5 0.087167
delta_t4 0.082168
delta_t6 0.081259
IDADE 0.074725
RADIO 0.074704
DIAGPREV 0.062018
Name: RECNENHUM, dtype: float64
Divisão em treino e teste¶
[ ]:
df_SP.RECNENHUM.value_counts()
1 732633
0 73769
Name: RECNENHUM, dtype: int64
[ ]:
# Downsample the majority class (RECNENHUM == 1) to 400k rows to keep
# the SP dataset tractable, then recombine with all minority rows and
# restore the original row order.
n_samples = 400000
df_SP_rec = df_SP[df_SP.RECNENHUM == 1].sample(n_samples, random_state=seed).sort_index()
df_SP_sem_rec = df_SP[df_SP.RECNENHUM == 0]
df_SP_menor = pd.concat([df_SP_rec, df_SP_sem_rec]).sort_index()
df_SP_menor.RECNENHUM.value_counts()
1 400000
0 73769
Name: RECNENHUM, dtype: int64
[ ]:
df_fora.RECNENHUM.value_counts()
1 59137
0 3180
Name: RECNENHUM, dtype: int64
[ ]:
# Drop list for the RECNENHUM models: besides identifiers/dates and
# outcome columns, the individual recurrence columns (RECLOCAL/RECREGIO/
# RECDIST, REC01..REC04) are removed — they determine the label directly.
list_drop = ['UFRESID', 'DTCONSULT', 'DTDIAG', 'DTTRAT', 'DTRECIDIVA', 'DTULTINFO',
'IDADE', 'PERDASEG', 'CONSDIAG', 'TRATCONS', 'DIAGTRAT', 'RECLOCAL',
'RECREGIO', 'RECDIST', 'REC01', 'REC02', 'REC03', 'REC04', 'delta_t4',
'delta_t5', 'delta_t6', 'delta_t7', 'delta_t8', 'delta_t9', 'ob', 'ano_ob',
'ob_com_rec', 'ob_sem_rec', 'vivo_com_rec', 'vivo_sem_rec', 'ULTINFO']
label = 'RECNENHUM'
SP
[ ]:
X_trainSP, X_testSP, y_trainSP, y_testSP = get_train_test(df_SP_menor, list_drop, label)
X_train = (355326, 62), X_test = (118443, 62)
y_train = (355326,), y_test = (118443,)
Fora de SP
[ ]:
X_trainFora, X_testFora, y_trainFora, y_testFora = get_train_test(df_fora, list_drop, label)
X_train = (46737, 62), X_test = (15580, 62)
y_train = (46737,), y_test = (15580,)
Encoder e normalização¶
SP
[ ]:
X_trainSP_enc, enc_SP, norm_SP = train_preprocessing(X_trainSP, normalizer='StandardScaler')
Fora de SP
[ ]:
X_trainFora_enc, enc_fora, norm_fora = train_preprocessing(X_trainFora, normalizer='StandardScaler')
PCA¶
SP
[ ]:
# Fit a full PCA on the encoded SP training data and draw the scree plot.
pca = PCA()
principalComponents = pca.fit_transform(X_trainSP_enc)
# BUG FIX: the x axis previously ran over X_trainSP_enc.shape[0] (the
# number of training rows, ~355k) while explained_variance_ratio_ has one
# entry per principal component — use the component count so the x and y
# lengths match.
n_components = len(pca.explained_variance_ratio_)
component_axis = np.arange(1, n_components + 1)
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=component_axis,
        y=np.cumsum(pca.explained_variance_ratio_),
        line_shape='hv',
    ))
fig.add_trace(
    go.Bar(
        x=component_axis,
        y=pca.explained_variance_ratio_
    ))
fig.update_layout(yaxis_title='Variância que pode ser "explicada"', xaxis_title='Número de componentes principais')
fig.show()
Fora de SP
[ ]:
# Fit a full PCA on the encoded outside-SP training data and draw the
# scree plot.
pca = PCA()
principalComponents = pca.fit_transform(X_trainFora_enc)
# BUG FIX: the x axis previously ran over X_trainFora_enc.shape[0] (the
# number of training rows) while explained_variance_ratio_ has one entry
# per principal component — use the component count so lengths match.
n_components = len(pca.explained_variance_ratio_)
component_axis = np.arange(1, n_components + 1)
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=component_axis,
        y=np.cumsum(pca.explained_variance_ratio_),
        line_shape='hv',
    ))
fig.add_trace(
    go.Bar(
        x=component_axis,
        y=pca.explained_variance_ratio_
    ))
fig.update_layout(yaxis_title='Variância que pode ser "explicada"', xaxis_title='Número de componentes principais')
fig.show()
Balanceamento dos dados¶
SP
[ ]:
# Number of samples for each label in the training data
y_trainSP.value_counts()
1 300057
0 55269
Name: RECNENHUM, dtype: int64
[ ]:
X_SP, y_SP = SMOTE(random_state=seed).fit_resample(X_trainSP_enc, y_trainSP)
[ ]:
y_SP.shape
(600114,)
Fora de SP
[ ]:
# Number of samples for each label in the training data
y_trainFora.value_counts()
1 44367
0 2370
Name: RECNENHUM, dtype: int64
[ ]:
X_fora, y_fora = SMOTE(random_state=seed).fit_resample(X_trainFora_enc, y_trainFora)
[ ]:
y_fora.shape
(88734,)
Treinamento dos modelos de Machine Learning¶
Random Forest¶
[ ]:
# SP
# NOTE(review): class_weight={0: 20, 1: 0.00006} massively favors class 0
# even after SMOTE balancing — presumably deliberate recall tuning for the
# rare "had recurrence" (0) class; confirm this was intentional.
rf_sp = RandomForestClassifier(random_state=seed,
class_weight={0: 20, 1: 0.00006})
rf_sp.fit(X_SP, y_SP)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
class_weight={0: 20, 1: 6e-05}, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
max_samples=None, min_impurity_decrease=0.0,
min_impurity_split=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=100, n_jobs=None, oob_score=False,
random_state=10, verbose=0, warm_start=False)
[ ]:
# Outside SP
# NOTE(review): same extreme class-0 weighting as the SP model above —
# confirm intentional.
rf_fora = RandomForestClassifier(random_state=seed,
class_weight={0: 24, 1: 0.000022})
rf_fora.fit(X_fora, y_fora)
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
class_weight={0: 24, 1: 2.2e-05}, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
max_samples=None, min_impurity_decrease=0.0,
min_impurity_split=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
n_estimators=100, n_jobs=None, oob_score=False,
random_state=10, verbose=0, warm_start=False)
XGBoost¶
[ ]:
# SP
# scale_pos_weight < 1 down-weights the positive (majority-origin) class,
# mirroring the random-forest class_weight bias above.
xgboost_sp = xgb.XGBClassifier(max_depth=15,
scale_pos_weight=0.15,
random_state=seed)
xgboost_sp.fit(X_SP, y_SP)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0,
learning_rate=0.1, max_delta_step=0, max_depth=15,
min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
nthread=None, objective='binary:logistic', random_state=10,
reg_alpha=0, reg_lambda=1, scale_pos_weight=0.15, seed=None,
silent=None, subsample=1, verbosity=1)
[ ]:
# Outside SP
xgboost_fora = xgb.XGBClassifier(max_depth=15,
scale_pos_weight=0.02, # values 0.05 and 0.01 were also tried
random_state=seed)
xgboost_fora.fit(X_fora, y_fora)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0,
learning_rate=0.1, max_delta_step=0, max_depth=15,
min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
nthread=None, objective='binary:logistic', random_state=10,
reg_alpha=0, reg_lambda=1, scale_pos_weight=0.02, seed=None,
silent=None, subsample=1, verbosity=1)
Validação dos modelos¶
Pré-processamento dos dados de teste¶
[ ]:
# SP
# Apply the fitted encoders/scaler to the SP test split.
X_testSP_ = test_preprocessing(X_testSP, enc_SP, norm_SP)
[ ]:
# Outside SP
# Apply the fitted encoders/scaler to the outside-SP test split.
X_testFora_ = test_preprocessing(X_testFora, enc_fora, norm_fora)
Random Forest¶
[ ]:
# Row-normalized confusion matrix, SP random forest (RECNENHUM).
plot_confusion_matrix(rf_sp, X_testSP_, y_testSP, cmap='Blues', normalize="true", values_format='.2f')
plt.show()
[ ]:
plot_feat_importances(rf_sp, X_testSP)
[ ]:
# Row-normalized confusion matrix, outside-SP random forest (RECNENHUM).
plot_confusion_matrix(rf_fora, X_testFora_, y_testFora, cmap='Blues', normalize="true", values_format='.2f')
plt.show()
[ ]:
plot_feat_importances(rf_fora, X_testFora)
XGBoost¶
[ ]:
# Row-normalized confusion matrix, SP XGBoost (RECNENHUM).
plot_confusion_matrix(xgboost_sp, X_testSP_, y_testSP, cmap='Blues', normalize="true", values_format='.2f')
plt.show()
[ ]:
plot_feat_importances(xgboost_sp, X_testSP)
[ ]:
# Row-normalized confusion matrix, outside-SP XGBoost (RECNENHUM).
plot_confusion_matrix(xgboost_fora, X_testFora_, y_testFora, cmap='Blues', normalize="true", values_format='.2f')
plt.show()
[ ]:
plot_feat_importances(xgboost_fora, X_testFora)
Label = RECDIST¶
Dados¶
[ ]:
# Reload the raw datasets for the RECDIST (distant-recurrence) experiments.
df_SP = read_csv('/content/drive/MyDrive/Trabalho/Cancer/Datasets/cancer_SP_labels.csv')
df_fora = read_csv('/content/drive/MyDrive/Trabalho/Cancer/Datasets/cancer_foraSP_labels.csv')
(806402, 94)
(62317, 94)
SP
[ ]:
df_SP.isna().sum().sort_values(ascending=False).head(6)
DTRECIDIVA 717377
delta_t6 717377
delta_t5 717377
delta_t4 717377
IDADE 1
TRATAMENTO 0
dtype: int64
[ ]:
# Top-20 absolute correlations with RECDIST — recurrence/outcome columns
# again dominate and are dropped before training.
corr_matrix = df_SP.corr()
abs(corr_matrix['RECDIST']).sort_values(ascending = False).head(20)
RECDIST 1.000000
RECNENHUM 0.497837
ob_com_rec 0.427203
vivo_com_rec 0.238029
vivo_sem_rec 0.181668
ano_ob 0.119045
ob_sem_rec 0.113154
QUIMIO 0.109437
ob 0.105117
CATEATEND 0.093630
ANODIAG 0.084918
LATERALI 0.081624
RECLOCAL 0.058266
PERDASEG 0.055281
HORMONIO 0.045904
RRAS 0.043732
IBGEATEN 0.043044
RADIO 0.041206
delta_t5 0.040047
RADIOAPOS 0.038809
Name: RECDIST, dtype: float64
Fora de SP
[ ]:
df_fora.isna().sum().sort_values(ascending=False).head(6)
DTRECIDIVA 57799
delta_t6 57799
delta_t5 57799
delta_t4 57799
NAOTRAT 0
LOCALTNM 0
dtype: int64
[ ]:
# Same leakage screen for outside SP: top-20 absolute correlations with
# RECDIST.
corr_matrix = df_fora.corr()
abs(corr_matrix['RECDIST']).sort_values(ascending = False).head(20)
RECDIST 1.000000
RECNENHUM 0.579667
ob_com_rec 0.459205
vivo_com_rec 0.343707
vivo_sem_rec 0.202364
ano_ob 0.120521
QUIMIO 0.107406
ob 0.095742
CATEATEND 0.087712
ob_sem_rec 0.078666
LATERALI 0.073880
DIAGPREV 0.071119
ANODIAG 0.067613
RADIO 0.062403
RECLOCAL 0.050995
PERDASEG 0.047730
HORMONIO 0.046865
delta_t5 0.030107
DIAGTRAT 0.027065
delta_t2 0.027065
Name: RECDIST, dtype: float64
Divisão em treino e teste¶
[ ]:
df_SP.RECDIST.value_counts()
0 786768
1 19634
Name: RECDIST, dtype: int64
[ ]:
# Downsample the majority class (RECDIST == 0) to 400k rows, keep every
# positive row, recombine and restore the original row order.
n_samples = 400000
df_SP_sem_rec = df_SP[df_SP.RECDIST == 0].sample(n_samples, random_state=seed).sort_index()
df_SP_rec = df_SP[df_SP.RECDIST == 1]
df_SP_menor = pd.concat([df_SP_rec, df_SP_sem_rec]).sort_index()
df_SP_menor.RECDIST.value_counts()
0 400000
1 19634
Name: RECDIST, dtype: int64
[ ]:
df_fora.RECDIST.value_counts()
0 61211
1 1106
Name: RECDIST, dtype: int64
[ ]:
list_drop = ['UFRESID', 'DTCONSULT', 'DTDIAG', 'DTTRAT', 'DTRECIDIVA', 'DTULTINFO',
'IDADE', 'PERDASEG', 'CONSDIAG', 'TRATCONS', 'DIAGTRAT', 'RECNENHUM',
'RECLOCAL', 'RECREGIO', 'REC01', 'REC02', 'REC03', 'REC04', 'delta_t4',
'delta_t5', 'delta_t6', 'delta_t7', 'delta_t8', 'delta_t9', 'ob', 'ano_ob',
'ob_com_rec', 'ob_sem_rec', 'vivo_com_rec', 'vivo_sem_rec', 'ULTINFO']
lb = 'RECDIST'
SP
[ ]:
X_trainSP, X_testSP, y_trainSP, y_testSP = get_train_test(df_SP_menor, list_drop, lb)
X_train = (314725, 62), X_test = (104909, 62)
y_train = (314725,), y_test = (104909,)
Fora de SP
[ ]:
X_trainFora, X_testFora, y_trainFora, y_testFora = get_train_test(df_fora, list_drop, lb)
X_train = (46737, 62), X_test = (15580, 62)
y_train = (46737,), y_test = (15580,)
Encoder e normalização¶
SP
[ ]:
X_trainSP_enc, enc_SP, norm_SP = train_preprocessing(X_trainSP, normalizer='StandardScaler')
Fora de SP
[ ]:
X_trainFora_enc, enc_fora, norm_fora = train_preprocessing(X_trainFora, normalizer='StandardScaler')
PCA¶
SP
[ ]:
# Fit a full PCA on the encoded SP training data and draw the scree plot.
pca = PCA()
principalComponents = pca.fit_transform(X_trainSP_enc)
# BUG FIX: the x axis previously ran over X_trainSP_enc.shape[0] (the
# number of training rows) while explained_variance_ratio_ has one entry
# per principal component — use the component count so lengths match.
n_components = len(pca.explained_variance_ratio_)
component_axis = np.arange(1, n_components + 1)
fig = go.Figure()
fig.add_trace(
    go.Scatter(
        x=component_axis,
        y=np.cumsum(pca.explained_variance_ratio_),
        line_shape='hv',
    ))
fig.add_trace(
    go.Bar(
        x=component_axis,
        y=pca.explained_variance_ratio_
    ))
fig.update_layout(yaxis_title='Variância que pode ser "explicada"', xaxis_title='Número de componentes principais')
fig.show()